

#################################
#################################
## Gene set analysis module
#################################
#################################
run.gene.set.analyses = function(DEresults,prb.sets,prb.set.names,deregscores=NULL,saveresults=TRUE,pval.adjustment,
                                 log,plottypearg,path.results,path.inc,path.to.csvs,path.to.GSA.results,prb.annots)
{
  print ("Starting gene set analysis (GSA)")
  cat("LOG:Starting gene set analysis (GSA)",file=log,sep='\n\n',append=TRUE)
  cat("document.write('<p>Starting gene set analysis (GSA)</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)

  # remove columns of gene set matrix that are empty:
  empty.prb.sets = colSums(prb.sets)==0
  #gene.sets = gene.sets[,!empty.gene.sets]
  #gene.set.names = gene.set.names[!empty.gene.sets]
 
  nvars = DEresults$npredictors
  pvals=as.data.frame(DEresults$pvals[,1:nvars]); dimnames(pvals)[[2]]=dimnames(DEresults$pvals)[[2]][1:nvars]
  ests=as.data.frame(DEresults$ests[,1:nvars]); dimnames(ests)[[2]]=dimnames(DEresults$ests)[[2]][1:nvars]
  ngenes = dim(DEresults$pvals)[1]
  newtermnames = DEresults$newtermnames
  newtermnames.linebreak = DEresults$newtermnames.linebreak
  # revise prb.sets so the genes match those of DEresults:
  prb.sets = prb.sets[match(dimnames(pvals)[[1]],dimnames(prb.sets)[[1]]),]
  K = dim(prb.sets)[2]
  covariates = dimnames(DEresults$ests)[[2]][1:nvars]
  
  # matrices of statistics to save:
  global.sig.stats = matrix(NA,dim(prb.sets)[2],length(covariates))
  dimnames(global.sig.stats)[[1]]=prb.set.names
  dimnames(global.sig.stats)[[2]]=newtermnames[covariates]
  global.sig.stats.updown = global.sig.stats

  
  for(k in 1:K)
  {
    print(paste("Creating gene set analysis (GSA) plots for gene set ",k,"/", K, sep=""))
    cat(paste("LOG:Creating gene set analysis (GSA) plots for gene set ",k,"/", K, sep=""),file=log,sep='\n\n',append=TRUE)
    cat(paste("document.write('<p>Creating gene set analysis (GSA) plots for gene set ",k,"/", K, "</p>');", sep=""), file=paste(path.inc,"//status.js",sep=""),append=TRUE)
    
    # for each covariate saved in DEresults
    for(i in 1:nvars)
    {

      # pvals.all = pvals[,i]
      # tstats.all = ests[,i]/DEresults$ses[,i]
      # pvals.path = pvals.all[prb.sets[,k]==1]

      diffExpResults             <- DEresults$out.df.list[[i]]  #[,c("Log2 fold change", "geneName", "Accession.Number")] 
      diffExpResults$tstats.all  <- diffExpResults[,"Log2 fold change"]/diffExpResults[,"std error"]

      diffExpResults.path        <- diffExpResults[match(rownames(prb.sets)[prb.sets[,k] == 1],diffExpResults$probe.ID),]
      
      # calculate global significance stat:

      # round(sqrt(mean((tstats.all^2)[prb.sets[,k]==1])),4)
      global.sig.stats[k,i] = round(sqrt(mean(diffExpResults.path[,"tstats.all"]^2)),4)

      # mean((sign(tstats.all)*(tstats.all^2))[prb.sets[,k]==1])
      global.sig.stats.updown[k,i] = mean(sign(diffExpResults.path[,"tstats.all"])*diffExpResults.path[,"tstats.all"]^2)

      global.sig.stats.updown[k,i] = round(sign(global.sig.stats.updown[k,i])*sqrt(abs(global.sig.stats.updown[k,i])),4)

      if(is.element(DEresults$pval.adjustment,c("BH","BY","Bonf"))) fdr.cutoffs <- DEresults$fdr.cutoff.list[[i]]

      unfilled.Shapes <- c(1,2,0,5)
      filled.Shapes   <- c(16,17,15,18)

      if(saveresults)
      {
        # draw a volcano plot
        for(r in 1:length(plottypearg)){
          plottype=plottypearg[r];
          
          #tempfilename = drawplot(paste(path.to.GSA.results,"//volcano plot - ",make.names(covariates[i])," - ",prb.set.names[k],sep=""),plottype,width=1.3,height=1.3)
          #tempfilename = drawplot(paste(path.to.GSA.results,"//volcano plot - ",make.names(covariates[i])," - ",gsub("/", "-", prb.set.names[k], ignore.case = FALSE, perl = F, fixed = T),sep=""),plottype,width=1.3,height=1.3)
          tempfilename = drawplot(paste(path.to.GSA.results,"//volcano plot - ",make.names(covariates[i])," - ",make.names(prb.set.names[k]),sep=""),plottype,width=1.3,height=1.3)
          tempfilename=gsub(path.results,"results",tempfilename)
          
#          if(!is.element(DEresults$pval.adjustment,c("BH","BY","Bonf")))
#          {
#            # diffExpResults.path$"volcanocol" = rep("grey",nrow(diffExpResults.path))
#            diffExpResults.path$"volcanocol" = codecols2[3]
#
#            # diffExpResults[,"volcanopch"] = rep(1,ngenes)
#            diffExpResults$"volcanopch" = 16
#          }

          par(xpd=TRUE)
          par(mar=c(5,4,5,1))

          ### draw the volcano plot:
          for(l in unique(diffExpResults$Analyte.Type))
          {
            if(which(unique(diffExpResults$Analyte.Type) == l) == 1)
            {
              ttl.i <- newtermnames[covariates[i]]
              tmp <- unlist(strsplit(ttl.i,split = " "))
              if(length(tmp)>3)
                ttl.i <- paste(paste(tmp[1:3],collapse=" "),"\n",paste(tmp[-(1:3)],collapse = " "),sep="")
              
              plot(diffExpResults[diffExpResults[,"Analyte.Type"] == l,"Log2 fold change"], -log10(diffExpResults[diffExpResults[,"Analyte.Type"] == l,"P-value"]), 
                   xlab = "log2(fold change)", ylab = "-log10(p-value)",main=ttl.i, cex.main = 1,
                   xlim = c(min(diffExpResults[,"Log2 fold change"]), max(diffExpResults[,"Log2 fold change"])), 
                   ylim = c(min(-log10(diffExpResults[,"P-value"])), max(-log10(diffExpResults[,"P-value"]))),
                   col = "azure4", #diffExpResults[diffExpResults[,"Analyte.Type"] == l,"volcanocol"], 
                   pch = diffExpResults[diffExpResults[,"Analyte.Type"] == l,"volcanopch"])
            } else{
              points(diffExpResults[diffExpResults[,"Analyte.Type"] == l,"Log2 fold change"], -log10(diffExpResults[diffExpResults[,"Analyte.Type"] == l,"P-value"]), 
                     col = "azure4", #diffExpResults[diffExpResults[,"Analyte.Type"] == l,"volcanocol"],
                     pch = diffExpResults[diffExpResults[,"Analyte.Type"] == l,"volcanopch"])
            }
          }

          # draw lines for pval cutoffs if there's no FDR calculated:
          if(!is.element(pval.adjustment,c("BH","BY", "Bonf")))
          {
            #legend("bottomright",pch=c(16,16),col=c(codecols2[1],codecols2[2]),legend = c("p < 0.01","p < 0.001"))
            legend("topleft",bty="n",inset=c(0,-0.16), lty=1:2, legend = c("p-value < 0.01","p-value < 0.001"), cex = 1)
            legend("topright",bty="n",inset=c(0,-0.16), legend = c(unique(diffExpResults$Analyte.Type), "probe set"), 
              pch = c(filled.Shapes[1:length(unique(diffExpResults$Analyte.Type))], 16), 
              col = c("azure4","azure4",codecols2[3]), #color.Analyte[1:length(unique(out.df$Analyte.Type))], 
              cex = 1)
          }

          if(!is.element(pval.adjustment,c("BH","BY", "Bonf")))
          {
            par(xpd = F)
            abline(h=-log10(c(0.01,0.001)),lty= c(1:2))
          }

          # draw lines for FDR cutoffs if FDR has been calculated:
          if(is.element(pval.adjustment,c("BH","BY", "Bonf")))
          {
            #legend("bottomright",pch=c(16,16),col=c(codecols2[1],codecols2[2]),legend = c("p < 0.01","p < 0.001"))
            legend("topleft",bty="n",inset=c(0,-0.16),lty=1:4,legend = paste("adj. p-value <",c("0.01","0.05","0.10","0.50")), cex = 1)
            legend("topright",bty="n",inset=c(0,-0.16), legend = c(unique(diffExpResults$Analyte.Type), "probe set"), 
              pch = c(filled.Shapes[1:length(unique(diffExpResults$Analyte.Type))], 15), 
              col = c("azure4","azure4","red"), #color.Analyte[1:length(unique(out.df$Analyte.Type))], 
              cex = 1)
          }

          # draw lines for FDR cutoffs if FDR has been calculated:
          if(is.element(pval.adjustment,c("BH","BY", "Bonf")))
          {
            par(xpd = F)
            abline(h=-log10(fdr.cutoffs),lty=1:4)
          }

          points(diffExpResults.path$"Log2 fold change", -log10(diffExpResults.path$"P-value"), pch=15, cex = 1.25, col="red") #codecols2[3])

          #legend("bottomright",pch=c(16,16),col=c(codecols2[3]),legend = paste(prb.set.names[k],"pathway genes"))

          sig.and.in.path = diffExpResults.path$"P-value" < 0.01    #(pvals.all<0.01)&(prb.sets[,k]==1)

          #if(sum(sig.and.in.path)>0){text(x=ests[sig.and.in.path,i],y=-log10(pvals[sig.and.in.path,i]),labels=dimnames(ests)[[1]][sig.and.in.path],cex=.8)}

          if(any(sig.and.in.path)){
            text(x = diffExpResults.path[sig.and.in.path,"Log2 fold change"],y = -log10(diffExpResults.path[sig.and.in.path,"P-value"]), labels = diffExpResults.path[sig.and.in.path,"Probe.Label"], cex=.8, font = 2)
          }

          dev.off()
        }
 
        # write a csv of results for just genes in the pathway:
        # select1 = (prb.sets[,k]==1)
#        out = cbind(DEresults$ests[,i],DEresults$ests[,i]-1.96*DEresults$ses[,i],DEresults$ests[,i]+1.96*DEresults$ses[,i],
#                    DEresults$pvals[,i],as.data.frame(DEresults$Pathways))
#        extracolumn=FALSE
#        if(pval.adjustment=="BY")
#        {
#          extracolumn=TRUE
#          pvals.a = p.adjust(pvals[,i],method="BY")
#          out = cbind(out,pvals.a); dimnames(out)[[2]][6] = "FDR"
#        }
#        if(pval.adjustment=="BH")
#        {
#          extracolumn=TRUE
#          pvals.a = p.adjust(pvals[,i],method="BH")
#          out = cbind(out,pvals.a); dimnames(out)[[2]][6] = "FDR"
#        }
#        if(pval.adjustment=="Bonf")
#        {
#          extracolumn=TRUE
#          pvals.a = p.adjust(pvals[,i],method="Bonf")
#          out = cbind(out,pvals.a); dimnames(out)[[2]][6] = "Bonf. p-value"
#        }
#       out = out[select1,]
#       out = out[order(out[,4]),]

        out <- diffExpResults.path          #[,c(1:8)]   paste(toupper(pval.adjustment), ".p.value", sep = "")
        out <- out[order(out[,"P-value"]),]

#        if(extracolumn){out = out[,c(1:4,6,5)]}
#        if(!extracolumn){dimnames(out)[[2]] = c("Log2 fold change","Lower confidence limit","Upper confidence limit","P-value","Pathways")}
#        if(extracolumn){dimnames(out)[[2]] = c("Log2 fold change","Lower confidence limit","Upper confidence limit","P-value",dimnames(out)[[2]][5],"Pathways")}
#        out[,c(1:3)]=signif(out[,c(1:3)],3)
#        out[,4]=signif(out[,4],4)
#        if(extracolumn){out[,5]=signif(out[,5],4)} 
        
#        write.csv(out,file=correct.filename(paste(path.to.GSA.results,"//Genes in ",dimnames(prb.sets)[[2]][k]," - ",make.names(dimnames(DEresults$ests)[[2]][i]),".csv",sep="")))
#        write.csv(out,file=correct.filename(paste(path.to.GSA.results,"//Genes in ",gsub("/", "-", dimnames(prb.sets)[[2]][k], ignore.case = FALSE, perl = F, fixed = T)," - ",make.names(dimnames(DEresults$ests)[[2]][i]),".csv",sep="")))
        
        #Replace probe ID with label:
        #---------------------------
        out$probe.ID <- paste(prb.annots[out$probe.ID,"Probe.Label"],prb.annots[out$probe.ID,"Analyte.Type"],sep = "-")
        colnames(out)[colnames(out)=="probe.ID"] <- "Probe Label"
        write.csv(out,file=correct.filename(paste(path.to.GSA.results,"//Genes in ", make.names(dimnames(prb.sets)[[2]][k])," - ",make.names(dimnames(DEresults$ests)[[2]][i]),".csv",sep="")),row.names = F)

      } #end if(saveresults)
    }
  }
  
  
  if(saveresults)
  {
    
    print("Creating gene set analysis (GSA) results files")
    cat("LOG:Creating gene set analysis (GSA) results files",file=log,sep='\n\n',append=TRUE)
    cat("document.write('<p>Creating gene set analysis (GSA) results files</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
    
    # write global significance statistics:
    write.csv(round(global.sig.stats,3),file=paste(path.to.GSA.results,"//global significance stats - mean abs t-stat.csv",sep=""))
    write.csv(round(global.sig.stats.updown,3),file=paste(path.to.GSA.results,"//global significance stats - mean t-stat.csv",sep=""))
    bothstats = cbind(global.sig.stats,global.sig.stats.updown)
    colnames(bothstats) = paste(c(rep("Undirected",ncol(global.sig.stats)),rep("Directed",ncol(global.sig.stats.updown))),colnames(bothstats))
    write.csv(round(bothstats,3),file=paste(path.to.GSA.results,"//global significance stats - directed and undirected.csv",sep=""))
    
    if(dim(global.sig.stats)[2]>1)
    {
      
      ## heatmaps of path significance scores

      # remove rows for any gene sets without results:
      global.sig.stats = global.sig.stats[!empty.prb.sets,]
      global.sig.stats.updown = global.sig.stats.updown[!empty.prb.sets,]

      # add line breaks to make column titles fit in the margins:
      global.sig.stats.linebreaks = global.sig.stats
      global.sig.stats.updown.linebreaks = global.sig.stats.updown
      dimnames(global.sig.stats.linebreaks)[[2]]=dimnames(global.sig.stats.updown.linebreaks)[[2]]=newtermnames.linebreak[covariates]
      
      for(r in 1:length(plottypearg)){
        
        plottype=plottypearg[r];
        tempfilename = drawplot(paste(path.to.GSA.results,"//heatmap of global significance scores",sep=""),plottype,width=1.7,height=1.7)
        tempfilename=gsub(path.results,"results",tempfilename)
        
        par(mar=c(10,4,2,1))
        hmcols<-colorRampPalette(c("cornflowerblue","black","orange"))(256)
        heatmap.2(global.sig.stats.linebreaks,scale="none",symm=FALSE,trace="none",density.info="none",Colv=TRUE,margin=c(12,12),col=hmcols,
                  #cexCol = 0.2 + 1/log10(dim(global.sig.stats)[1]),
                  cexRow=0.85,cexCol=0.85, keysize = 1, key.xlab = "score")
        dev.off()
        tempfilename = drawplot(paste(path.to.GSA.results,"//heatmap of directed global significance scores - directed",sep=""),plottype,width=1.7,height=1.7)
        tempfilename=gsub(path.results,"results",tempfilename)
        
        par(mar=c(10,4,2,1))
        hmcols<-colorRampPalette(c("blue","antiquewhite3",codecols2[4]))(256)
        breaks = seq(-max(abs(global.sig.stats.updown)),max(abs(global.sig.stats.updown)),length.out=length(hmcols)+1)
        heatmap.2(global.sig.stats.updown.linebreaks,scale="none",symm=FALSE,trace="none",density.info="none",Colv=TRUE,col=hmcols,breaks=breaks,margin=c(12,12),
                  #cexCol = 0.2 + 1/log10(dim(global.sig.stats)[1])
                  cexRow=0.85,cexCol=0.85, keysize = 1, key.xlab = "score")
        dev.off()
      }
    } 
    print("Creating gene set analysis (GSA) HTML infrastructure")
    cat("LOG:Creating gene set analysis (GSA) HTML infrastructure",file=log,sep='\n\n',append=TRUE)
    cat("document.write('<p>Creating gene set analysis (GSA) HTML infrastructure</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
  } #end if(saveresults)
  
  
  out = list(mean.abs.tstats = global.sig.stats,mean.tstats = global.sig.stats.updown)
  print("Finished gene set analysis (GSA)")
  cat("LOG: Finished gene set analysis (GSA)",file=log,sep='\n\n',append=TRUE)
  cat("document.write('<p>Finished gene set analysis (GSA)</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
  return(out)
}
